05_analysis_1

Variable analysis

Loading libraries

library("PerformanceAnalytics")
library("randomForest")
library("tidyverse")
library("caret")
library("dplyr")

Setting seed for reproducibility

# Seed the random number generator so later random steps are repeatable
set.seed(seed = 123)

Loading augmented data

# Read the augmented kidney data from its RDS file
kidney_augmented <- "../data/03_data_augment.rds" |>
  read_rds()

Correlation plot

The correlation plot shows the pairwise correlations between all numeric variables in the dataset. From it we can see which variables are correlated and how strongly.

# Pairwise correlation chart (with histograms) of the selected columns
chart.Correlation(
  select(
    kidney_augmented,
    age,
    blood_pressure,
    blood_glucose_random,
    blood_urea,
    serum_creatinine,
    sodium,
    potassium,
    hemoglobin,
    packed_cell_volume,
    white_blood_cell_count,
    red_blood_cell_count,
    GFR_male,
    GFR_female,
    GFR_average
  ),
  histogram = TRUE,
  pch = 19
)

Random forest analysis

We use a random forest to inspect variable importance and guide further variable selection. This analysis also includes the factor variables.

# Build the modelling data: keep complete rows only, then drop the GFR_*,
# cdk_stage and specific_gravity columns so they are not used as predictors
# by the `classification ~ .` formula below.
# NOTE(review): na.omit() runs before select(), so rows whose only NAs are in
# the dropped columns are removed as well -- confirm this is intended.
data <- kidney_augmented |>
  na.omit() |>
  select(-c(GFR_female, GFR_male, GFR_average, cdk_stage, specific_gravity))

# Partition on the outcome column: p = 0.6 of the rows go to training.
# list = FALSE returns the row indices as a matrix rather than a list.
idx <- data |>
  pull(classification) |>
  createDataPartition(p = 0.6, list = FALSE)

train <- data[idx, ]
test <- data[-idx, ]

# Fit a 600-tree forest on all remaining columns. importance = TRUE asks
# randomForest to assess predictor importance, used later by varImpPlot().
rf1 <- train |>
  randomForest(classification ~ ., data = _, importance = TRUE, ntree = 600)

# First column of err.rate plotted against the number of trees
# (per the randomForest docs this is the OOB error for classification forests)
rf1$err.rate[, 1] |>
  plot(type = "l")

Visualizing the confusion matrix

The confusion matrix shows how well the model performs on the test set.

# Observed classes of the held-out rows
truth <- test |>
  pull(classification)

# Classes predicted by rf1 for the same rows
pred <- rf1 |>
  predict(newdata = test)

# caret expects the predictions as `data` and the observed classes as
# `reference`; both are named so they cannot be swapped, which would
# transpose the table and exchange sensitivity/PPV and the prevalence figures.
confusionMatrix(data = pred, reference = truth)
Confusion Matrix and Statistics

          Reference
Prediction ckd notckd
    ckd     17      0
    notckd   0     46
                                     
               Accuracy : 1          
                 95% CI : (0.9431, 1)
    No Information Rate : 0.7302     
    P-Value [Acc > NIR] : 2.485e-09  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         
                                     
            Sensitivity : 1.0000     
            Specificity : 1.0000     
         Pos Pred Value : 1.0000     
         Neg Pred Value : 1.0000     
             Prevalence : 0.2698     
         Detection Rate : 0.2698     
   Detection Prevalence : 0.2698     
      Balanced Accuracy : 1.0000     
                                     
       'Positive' Class : ckd        
                                     

Variable importance plot

This plot shows how much each variable contributes to the accuracy of the model.

# Draw the importance plot on the active graphics device
varImpPlot(rf1)

# Draw it again into a PNG file, then close that device
png(
  filename = "../results/05_rf1_varImp.png",
  width = 800,
  height = 1200,
  type = "cairo"
)
varImpPlot(rf1)
dev.off()
svg 
  2 

Repeating random forest analysis with only important variables

# Reload the augmented data so this section can be run on its own
kidney_augmented <- read_rds("../data/03_data_augment.rds")

# Same preparation as for the first forest: complete rows only, then drop the
# GFR_*, cdk_stage and specific_gravity columns.
data <- kidney_augmented |>
  na.omit() |>
  select(-c(GFR_female, GFR_male, GFR_average, cdk_stage, specific_gravity))

# Partition on the outcome column: p = 0.6 of the rows go to training.
# NOTE(review): this draws a new random partition without re-seeding, so rf2
# is trained and tested on a different split than rf1 -- confirm that the two
# models are not meant to be compared on the same test rows.
idx <- data |>
  pull(classification) |>
  createDataPartition(p = 0.6, list = FALSE)

train <- data[idx, ]
test <- data[-idx, ]

# Fit a 600-tree forest restricted to the seven chosen predictors.
# importance = TRUE asks randomForest to assess predictor importance.
rf2 <- train |>
  randomForest(
    classification ~ packed_cell_volume + hemoglobin + serum_creatinine +
      red_blood_cell_count + albumin + hypertension + diabetes_mellitus,
    data = _,
    importance = TRUE,
    ntree = 600
  )

# First column of err.rate plotted against the number of trees
# (per the randomForest docs this is the OOB error for classification forests)
rf2$err.rate[, 1] |>
  plot(type = "l")

Visualizing the confusion matrix

The confusion matrix shows how well the model performs on the test set.

# Observed classes of the held-out rows
truth <- test |>
  pull(classification)

# Classes predicted by rf2 for the same rows
pred <- rf2 |>
  predict(newdata = test)

# caret expects the predictions as `data` and the observed classes as
# `reference`; both are named so they cannot be swapped, which would
# transpose the table and exchange sensitivity/PPV and the prevalence figures.
confusionMatrix(data = pred, reference = truth)
Confusion Matrix and Statistics

          Reference
Prediction ckd notckd
    ckd     17      0
    notckd   0     46
                                     
               Accuracy : 1          
                 95% CI : (0.9431, 1)
    No Information Rate : 0.7302     
    P-Value [Acc > NIR] : 2.485e-09  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         
                                     
            Sensitivity : 1.0000     
            Specificity : 1.0000     
         Pos Pred Value : 1.0000     
         Neg Pred Value : 1.0000     
             Prevalence : 0.2698     
         Detection Rate : 0.2698     
   Detection Prevalence : 0.2698     
      Balanced Accuracy : 1.0000     
                                     
       'Positive' Class : ckd        
                                     

Variable importance plot

This plot shows how much each variable contributes to the accuracy of the model.

# Draw the importance plot on the active graphics device
varImpPlot(rf2)

# Draw it again into a PNG file, then close that device
png(
  filename = "../results/05_rf2_varImp.png",
  width = 800,
  height = 1200,
  type = "cairo"
)
varImpPlot(rf2)
dev.off()
svg 
  2